# Load Pandas libraries
import warnings
import pandas as pd
import numpy as np
import math
from collections import Counter
# Load scikit-learn library for K-Means
warnings.simplefilter("ignore")
from tslearn.clustering import TimeSeriesKMeans
from sklearn.preprocessing import StandardScaler
# Import Plot libraries
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cbook
# NOTE: 'conda install -c conda-forge tslearn' is a shell command, not Python —
# as a bare statement it is a SyntaxError. Run it in a terminal (or as
# '!conda install -c conda-forge tslearn' in a notebook cell) before this script.
# Read raw data: weekly pivot (one column per department), indexed by date.
csv_file = 'im_weekly_dpt_pivot_2017_2019_dtw.csv'
data_url = '../data/' + csv_file
rawdata = pd.read_csv(data_url, index_col=['Date'])
print(rawdata)
# Read departments data (department -> zone lookup table)
dataURL = '../data/col_dpt_list.csv'
dpt_data = pd.read_csv(dataURL)
# Save zone by departments as a plain dict (replaces the iterrows loop;
# duplicate departments overwrite earlier entries exactly as before).
dpt_zone = dict(zip(dpt_data['department'], dpt_data['zone']))
dpt_zone
# Split data: keep the dates aside and switch to a positional index.
date_list = pd.DataFrame(rawdata.index)
data = rawdata.reset_index(drop=True)
# Per-department maximum value, used for the optional max-scaling below
# and (later) to rescale the long-format data consistently.
col_list = {col: data[col].max() for col in data.columns}
# Standardize data (max-scaling) — disabled by default.
std = False
if std:
    for col, max_value in col_list.items():
        if max_value > 0:  # skip all-zero columns to avoid division by zero
            data[col] = data[col] / max_value
print(data)
# Transform data into tslearn's expected shape: (n_series, n_timesteps, 1).
# Iterating col_list yields columns in insertion (i.e. original column) order.
x = np.array([[[v] for v in data[dpt].values] for dpt in col_list])
# Calculate department correlations (pairwise Pearson between columns)
corr = pd.DataFrame(data=data.values, columns=list(col_list.keys())).corr()
# Generate a mask for the upper triangle.
# FIX: the np.bool alias was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin bool is the correct dtype here.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
fig, ax1 = plt.subplots(figsize=(18, 18))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(10, 240, n = 9)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask = mask, cmap = cmap, vmin = -1, vmax = 1, center = 0,
square = True, linewidths = .5, cbar_kws = {"shrink": .5})
ax1.set_xticklabels(ax1.get_xticklabels(), rotation = 45, horizontalalignment = 'right');
# Add title
ax1.set_title("Department Correlation Triangle", fontsize = 20)
plt.show()
# Creating training dataset
n_cluster = 5
ds_train = x
Nc = range(1, 20)

def _plot_elbow(scores):
    # Plot inertia vs. k with a dashed marker at the chosen cluster count.
    # (Extracted: the original repeated this plotting block verbatim for
    # both metrics.)
    fig, ax0 = plt.subplots(figsize=(14, 6))
    plt.plot(Nc, scores, marker='o')
    plt.axvline(x=n_cluster, color="#8b0000", linestyle="--")
    plt.xticks(np.arange(1, 20, 1))
    plt.xlabel("Number of Clusters", fontsize = 12)
    plt.ylabel("Score", fontsize = 12)
    plt.title("Jambu Elbow Curve", fontsize = 20)
    plt.show()

# Calculating the Jambu Elbow scores (Euclidean metric)
kmeans = [TimeSeriesKMeans(n_clusters=i, metric="euclidean", max_iter=5, random_state=0) for i in Nc]
score = [km.fit(ds_train).inertia_ for km in kmeans]
_plot_elbow(score)
# Calculating the Jambu Elbow scores (DTW metric, with barycenter averaging)
kmeans = [TimeSeriesKMeans(n_clusters=i, metric="dtw", max_iter=5, max_iter_barycenter=5, random_state=0) for i in Nc]
score = [km.fit(ds_train).inertia_ for km in kmeans]
_plot_elbow(score)
# Calculates the K-Means for (x, y) dataset
def run_kmeans(k_clusters):
    """Fit DTW K-Means on the module-level ds_train.

    Returns a tuple (cluster labels per series, centroid time series).
    """
    # Euclidean alternative kept for reference:
    # model = TimeSeriesKMeans(n_clusters=k_clusters, metric="euclidean", max_iter=5, random_state=0)
    model = TimeSeriesKMeans(
        n_clusters=k_clusters,
        metric="dtw",
        max_iter=5,
        max_iter_barycenter=5,
        random_state=0,
    ).fit(ds_train)
    return model.predict(ds_train), model.cluster_centers_
# Create interactive control to control k value
clusters, centroids = run_kmeans(k_clusters=n_cluster)
clusters
# Clustering results dataframe (rows appended later by avg_time_series)
df_result = pd.DataFrame(columns=["n_item", "std_dev", "n_item_2", "w_std_dev"])
# Set color list
color_list = ["#1f77b4", "#ff7f0e", "#d62728", "#2ca02c", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
# Tally departments per cluster; labels are shifted to 1-based string names.
cluster_count = Counter(str(label + 1) for label in clusters)
# Cooking dataframe
df = pd.DataFrame.from_records(cluster_count.most_common(), columns = ['cluster', 'frequency']).sort_values(by=['cluster'])
fig, ax = plt.subplots()
df.plot.bar(ax=ax, x='cluster', y='frequency', alpha=0.75, figsize=(10, 7), color=color_list)
plt.title('Departments by Cluster', fontsize=16)
plt.xlabel('Cluster')
plt.ylabel('Frequency')
ax.get_legend().remove()
ax.grid()
plt.xticks(rotation=0)
plt.show()
# Tally zones per cluster: cluster name -> Counter of zone frequencies.
cluster_data = Counter()
# dpt_zone preserves the column order used to build x, so zipping it with
# the label array pairs each department with its cluster label.
for label, zone in zip(clusters, dpt_zone.values()):
    cluster = str(label + 1)
    if cluster not in cluster_data:
        cluster_data[cluster] = Counter()
    cluster_data[cluster][zone] += 1
sorted(cluster_data.items())
# Silence matplotlib deprecation chatter triggered by pandas plotting.
warnings.filterwarnings("ignore",category=matplotlib.cbook.mplDeprecation)
fig = plt.figure(figsize = (20, 4))
fig.subplots_adjust(hspace = 0.2, wspace = 0.2)
# FIX: the original iterated `clusters` (one label per department), redrawing
# each subplot once per member. Iterate the distinct labels instead — same
# final figure, one pass per occupied cluster.
for ix in sorted(set(clusters)):
    ax = plt.subplot(1, n_cluster, ix + 1)
    c_name = str(ix + 1)
    zone_freq = cluster_data[c_name].most_common()
    df = pd.DataFrame.from_records(zone_freq, columns = ['zone', 'frequency']).sort_values(by='frequency', ascending=False)
    df.plot.bar(ax=ax, x='zone', y='frequency', color=color_list[ix], alpha=0.75)
    ax.get_legend().remove()
    plt.title('Cluster ' + c_name, fontsize=16)
    # FIX: axis labels were swapped (x shows zones, y shows counts).
    plt.xlabel('Zone')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45)
plt.show()
# Build and print the department membership list of each cluster
# (1-based cluster names, departments in original column order).
dpt_cluster = {}
dpt_names = list(col_list.keys())
for i in range(n_cluster):
    cluster_name = (i+1)
    print('>> Cluster %s:' % cluster_name)
    dpt_list = [name for name, label in zip(dpt_names, clusters) if label == i]
    dpt_cluster[cluster_name] = dpt_list
    print(' ', dpt_list)
# Read raw data (long format: one row per date/department/value)
data_url = "../data/im_weekly_dpt_data.csv"
dptdata = pd.read_csv(data_url, parse_dates=["date"])
# 'EXTERIOR' is not a geographic department; drop its rows.
dptdata = dptdata.drop(dptdata[(dptdata.department == 'EXTERIOR')].index)
dptdata
# Standardize data with the same per-department maxima used for the pivot.
if std:
    dptdata['value'] = dptdata['value'].astype(float)
    # Vectorized max-scaling (replaces the original row-by-row iterrows
    # loop). Departments whose max is 0 keep their value unchanged by
    # dividing by 1, matching the original skip.
    max_values = dptdata['department'].map(col_list)
    dptdata['value'] = dptdata['value'] / max_values.where(max_values > 0, 1.0)
# One weekly-frequency time series per department, keyed by name.
dpt_ts = {}
for cluster, departments in dpt_cluster.items():
    for dpt in departments:
        df = dptdata[dptdata['department'] == dpt]
        ts = df.set_index('date')['value'].asfreq(freq='W')
        dpt_ts[dpt] = ts
# Average a list of time series
def avg_time_series(ts_data, ts_list, allow_dbl, c_name):
    """Return the element-wise average of the series named in ts_list.

    Also computes a dispersion statistic for the group (mean per-date
    std-dev, as a percentage of the group's max value) and appends it
    as row ``c_name - 1`` of the module-level ``df_result`` frame.

    Parameters
    ----------
    ts_data : dict mapping department name -> pandas Series
    ts_list : department names to average
    allow_dbl : bool — keep fractional averages; when False, round up
    c_name : int — 1-based cluster name, used as the df_result row index
    """
    avg_ts = None
    df = None
    n = len(ts_list)
    for dpt in ts_list:
        ts = ts_data[dpt]
        if avg_ts is None:
            avg_ts = ts
            df = pd.DataFrame(ts)
        else:
            # fill_value=0: dates missing from one series count as zero
            avg_ts = avg_ts.add(ts, fill_value=0)
            df = pd.concat([df, pd.DataFrame(ts)], axis=1, sort=False)
    avg_ts = avg_ts / n
    if not allow_dbl:
        # FIX: math.ceil() raises TypeError on a pandas Series (no
        # __ceil__); np.ceil applies element-wise as intended.
        avg_ts = np.ceil(avg_ts)
    std_dev = 0
    max_value = 0
    if len(df.columns) > 1:
        for ix, row in df.iterrows():
            values = np.nan_to_num(row.values)
            max_value = max(np.max(values), max_value)
            std_dev += np.std(values)
        # FIX: guard against division by zero when the group is all zeros.
        if max_value > 0:
            std_dev = 100.0 * std_dev / len(df) / max_value
    print('n time series:', n, ', std dev:', std_dev, 'max value:', max_value)
    df_result.loc[c_name - 1] = [n, std_dev, n * n, n * std_dev]
    return avg_ts
# Plot trends by year
def plot_cluster_curves(data, dpt_list, c_name, allow_dbl=True):
    """Overlay every department curve of a cluster plus their average.

    Individual series are drawn faintly; the average (which also records
    dispersion stats via avg_time_series) is drawn on top in black.
    """
    plt.figure(figsize=(18, 5), dpi=200)
    # Plot all member curves
    for dpt in dpt_list:
        plt.plot(data[dpt], label=dpt, alpha=0.25)
    # Plot the cluster average on top
    mean_curve = avg_time_series(data, dpt_list, allow_dbl, c_name)
    plt.plot(mean_curve, label='average', alpha=1, color='black')
    plt.title('Cluster %s IM trends' % c_name, fontsize=16)
    plt.xlabel('Date', fontsize=10)
    plt.ylabel('Num. Deaths', fontsize=10)
    plt.xticks(fontsize=10)
    plt.legend()
    plt.show()
# Plot every cluster. (The original repeated this call seven times with
# manual guards for clusters 5-7; a loop generalizes to any n_cluster.)
for c_name in range(1, n_cluster + 1):
    plot_cluster_curves(dpt_ts, dpt_cluster[c_name], c_name)
df_result
# Quadratic-mean cluster size and membership-weighted average dispersion.
cluster_avg_item = math.sqrt(np.mean(df_result['n_item_2'].values))
avg_wn_std_dev = np.sum(df_result['w_std_dev'].values) / np.sum(df_result['n_item'].values)
print('Avg Items by Cluster: %0.4f' % cluster_avg_item)
print('Avg WN Std Dev: %0.4f' % avg_wn_std_dev)
# End of analysis
# FIX: the trailing 'End of analysis' was bare prose — a SyntaxError — and
# is now a comment.